# Package setup for association-rule and sequential-pattern mining on MIMIC-III.
library(RMySQL)          # MySQL database connectivity
library(tidyverse)       # Data manipulation and plotting
library(lubridate)       # Date manipulation
library(arulesSequences) # Running the sequence mining algorithm (cSPADE)
library(ggtext)          # Markdown/rich-text flair for plot text
library(tidygraph)       # Creating a graph structure
library(ggraph)          # Plotting the network graph structure
library(arules)          # Apriori association-rule mining
library(arulesViz)       # Visualising association rules
library(arulesSequences) # NOTE(review): duplicate of the load above; redundant but harmless
library(redivis)         # Redivis data-platform API client

MIMIC-III Data

MIMIC-III documentation: https://mimic.mit.edu/docs/iii/

Association Rule Learning

Association rule learning is a rule-based machine learning method for discovering interesting relations between variables in large databases. It is intended to identify strong rules discovered in databases. In any given transaction with a variety of items, association rules are meant to discover the rules that determine how or why certain items are connected.

Definition:

  • Transactions: \(D=\{t_1, t_2, ..., t_m\}\)

  • Items: \(I=\{i_1, i_2, ..., i_n\}\)

  • Rule: \(X \Rightarrow Y\), X and Y are both sets of items, also known as itemsets

    • X: antecedent or left-hand-side (LHS)

    • Y: consequent or right-hand-side (RHS)

    This simply implies that, in theory, whenever X occurs in a dataset, then Y will occur as well.

Measures of interestingness:

  • Support: Support is an indication of how frequently the itemset appears in the dataset. The support of X with respect to T is defined as the proportion of transactions in the dataset which contains the itemset X.

    \(\text{support of X} = P(X) = \frac{\text{# of transactions containing X}}{\text{total # of transactions}} = \frac{|\{(i, t)\in T:X \subseteq t\}|}{|T|}\)

  • Confidence: Confidence is the percentage of all transactions satisfying X that also satisfy Y. With respect to T, the confidence value of an association rule \(X \Rightarrow Y\), is the ratio of transactions containing both X and Y to the total amount of X values present, where X is the antecedent and Y is the consequent.

    \(conf(X \Rightarrow Y) = P(Y|X) = \frac{supp(X \cup Y)}{supp(X)} = \frac{\text{# of transactions containing X and Y}}{\text{# of transactions containing X}}\)

  • Lift: Lift is defined as the ratio of the observed support to that expected if X and Y were independent. The value of lift indicates the degree to which the two itemsets are dependent on one another.

    \(lift(X \Rightarrow Y) = \frac{supp(X \cup Y)}{supp(X) \times supp(Y)}\)

    • lift > 1: the degree to which those two occurrences are dependent on one another, and makes those rules potentially useful for predicting the consequent in future data sets.

    • lift < 1: the items are substitutes for each other. This means that the presence of one item has a negative effect on the presence of the other item, and vice versa.

By setting the thresholds of Support and Confidence, we can find out the association rules which can satisfy the specified minimum support and confidence at the same time.

Usually, association rule generation is split into two different steps that need to be applied:

  • A minimum Support threshold to find all the frequent itemsets that are in the database.

  • A minimum Confidence threshold to the frequent itemsets found to create rules.

Notably, this simple algorithm can help find the shared associations of diseases between patients, but it ignores the timing information of the diagnoses.

Data Processing

Table used here: ADMISSIONS

Official documents: https://mimic.mit.edu/docs/iii/tables/admissions/

In this instance, data are extracted from Redivis database via a private api token and processed in the steps below:

  • delete patients who only have one admission record as “NEWBORN”;

  • delete patients who only have one diagnosis record;

  • reformat the dataset into a dataframe with column:(subject_id, diagnosis);

  • convert the structure of dataset from dataframe to transactions.

# Load API credentials, then pull the ADMISSIONS table from Redivis.
source("config.R")

admissions <- redivis::user("graph_ai")$
  dataset("mimic_iii_clinical_data:9rva:next")$
  table("admissions:k1ek")$
  to_tibble()

# One row per (patient, diagnosis): drop newborn admissions, split the
# semicolon-separated diagnosis field, and discard empty entries.
ad_df <- admissions %>%
  janitor::clean_names() %>%
  filter(diagnosis != "NEWBORN") %>%
  mutate(admittime = as.Date(admittime)) %>%
  arrange(subject_id, admittime) %>%
  select(subject_id, admittime, diagnosis) %>%
  separate_rows(diagnosis, sep = ";") %>%
  filter(diagnosis != "", diagnosis != " ")

# Number of diagnosis rows per patient, most frequent first.
ad_count <- ad_df %>%
  count(subject_id, name = "n", sort = TRUE)

# Keep only patients with more than one diagnosis record; the admission
# date is not needed for (unordered) association-rule mining.
ad_long_df <- ad_df %>%
  left_join(ad_count, by = "subject_id") %>%
  filter(n > 1) %>%
  select(-n, -admittime)

# Example: all diagnosis entries recorded for patient 109.
ad_long_df %>% filter(subject_id == 109) %>% head() %>% knitr::kable()
subject_id diagnosis
109 HYPERTENSIVE EMERGENCY
109 HTN CRISIS
109 HYPERTENSION
109 HYPERTENSIVE URGENCY
109 HYPERTENSIVE URGENCY
109 HYPERTENSIVE URGENCY
# Convert the long (subject_id, diagnosis) table into an arules
# 'transactions' object: one transaction per patient.
ad_trans <- transactions(
  ad_long_df,
  format = "long",
  cols = c("subject_id", "diagnosis")
)

Have a quick view of the transactions:

ad_trans %>% as("data.frame") %>% as_tibble() %>% head() %>% knitr::kable()
items transactionID
{ MINIMALLY INVASIVE RE-DO/SDA,AORTIC INSUFFICIENCYVALVE REPLACEMENT} 100
{END STAGE KIDNEY DISEASE,END STAGE LIVER DISEASE} 10000
{ICH,S/P FALL,SUBDURAL HEMATOMA,SEIZURE,C3 FRACTURE} 10004
{ REPAIR ASCENDING AORTIC ANEURYSM/SDA,ASCENDING AORTIC ANEURYSMVALVE REPLACEMENT} 10007
{ MITRAL REGURGITATION,CORONARY ARTERY DISEASEARTERY BYPASS GRAFT WITH MVR ? MITRAL VALVE REPLACEMENT /SDA} 10027
{SYNCOPE,TELEMETRY} 10029
# Top 20 diagnoses by raw transaction counts.
arules::itemFrequencyPlot(
  ad_trans,
  topN = 20,
  type = "absolute",
  main = "Absolute Item Frequency Plot",
  ylab = "Item Frequency (Absolute)"
)

# The same items, shown as a share of all transactions.
arules::itemFrequencyPlot(
  ad_trans,
  topN = 20,
  type = "relative",
  main = "Relative Item Frequency Plot",
  ylab = "Item Frequency (Relative)"
)

Pattern Mining: arules

# Mine association rules with minimum support 0.2% and minimum confidence 20%.
rules <- apriori(ad_trans, parameter = list(supp = 0.002, conf = 0.2))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.2    0.1    1 none FALSE            TRUE       5   0.002      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 24 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[8239 item(s), 12244 transaction(s)] done [0.01s].
## sorting and recoding items ... [147 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [36 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Top 10 rules ranked by confidence.
as(rules, "data.frame") %>%
  as_tibble() %>%
  select(rules, support, confidence, lift, count) %>%
  arrange(desc(confidence)) %>%
  slice_head(n = 10) %>%
  knitr::kable()
rules support confidence lift count
{TELEMETRY,TRANSIENT ISCHEMIC ATTACK} => {STROKE} 0.0279320 1.0000000 23.728682 342
{STROKE,TRANSIENT ISCHEMIC ATTACK} => {TELEMETRY} 0.0279320 0.9606742 7.198589 342
{TRANSIENT ISCHEMIC ATTACK} => {STROKE} 0.0290755 0.9468085 22.466518 356
{TRANSIENT ISCHEMIC ATTACK} => {TELEMETRY} 0.0279320 0.9095745 6.815685 342
{CHRONIC OBST PULM DISEASE,COPD EXACERBATION} => {ASTHMA} 0.0022052 0.8709677 45.186987 27
{STROKE} => {TELEMETRY} 0.0366710 0.8701550 6.520305 449
{PYELONEPHRITIS} => {URINARY TRACT INFECTION} 0.0109441 0.8535032 37.863380 134
{COPD EXACERBATION,PNEUMONIA} => {ASTHMA} 0.0029402 0.8372093 43.435554 36
{COPD EXACERBATION} => {ASTHMA} 0.0138027 0.8284314 42.980143 169
{ASTHMA,PNEUMONIA} => {COPD EXACERBATION} 0.0029402 0.8000000 48.015686 36
# Interactive scatter plot of all rules (x/y from support/confidence, shaded by lift).
# NOTE(review): method = NULL falls back to the arulesViz default, and
# `interactive = TRUE` looks redundant with engine = "htmlwidget" — confirm
# against the installed arulesViz version.
plot(rules, method = NULL, measure = "support", shading = "lift", 
     interactive = TRUE, 
     engine = "htmlwidget")
# The 20 highest-confidence rules, as an interactive network graph and a
# parallel-coordinates plot.
topsubRules <- head(rules, n = 20, by = "confidence")

plot(topsubRules, method = "graph",  engine = "htmlwidget")
plot(topsubRules, method="paracoord")

Sequential Pattern Mining

Sequential Pattern Mining is a topic of Data Mining concerned with finding statistically relevant patterns between data examples where the values are delivered in a sequence.

A sequence captures the order of events but does not require a notion of time.

Common algorithms for sequential pattern mining:

  • GSP (Generalized Sequential Pattern Mining)

  • SPADE (Sequential Pattern Discovery using Equivalence Class): A vertical format sequential pattern mining method which identifies each element in each sequence in a dataset with a Sequence ID (SID) and the Element ID (EID)

  • Non-Apriori Based Algorithms

  • PrefixSpan (Prefix-projected Sequential Pattern Mining)

Data Processing

Concept:

  • Sequence: An ordered list of itemsets.

  • Event: The element of a sequence.

  • Item: The element of an event.

Target structure of the dataframe which can be converted to class of transactions:

  • items: The diagnosis within one admission

  • transactionID: The identifier of transaction

  • sequenceID: Patient identifier

  • eventID: Event identifier

  • classID (optional): class information of patients

# Rebuild the admissions data for *sequential* mining: each admission becomes
# one event, identified per patient by admission order.
# NOTE(review): unlike the earlier association-rule pipeline, this one does not
# filter out "NEWBORN" admissions — confirm whether that is intentional.
ad_df <- admissions %>%
  janitor::clean_names() %>%
  mutate(
    admittime = as.Date(admittime), 
    trans_id = row_number()  # unique id per admission row (transactionID)
  ) %>%
  arrange(subject_id, admittime) %>%
  select(subject_id, diagnosis, trans_id) %>%
  group_by(subject_id) %>%
  mutate(
    event_id = row_number()  # admission order within each patient (eventID)
    ) %>%
  ungroup() %>% 
  # Split the semicolon-separated diagnosis string AFTER assigning event_id,
  # so all diagnoses from the same admission share one event.
  separate_rows(diagnosis, sep = ";") %>%
  filter(diagnosis != "") %>%
  filter(diagnosis != " ") %>% 
  group_by(subject_id, trans_id, event_id) %>% 
  summarise(items = list(diagnosis))  # one list-column itemset per admission
  

# Number of admission events per patient, most frequent first.
# (ad_df is still grouped from the preceding summarise, so drop the
# grouping before counting.)
ad_count <- ad_df %>%
  ungroup() %>%
  count(subject_id, name = "n", sort = TRUE)

# Keep patients with more than one event, ordered by patient and event
# as cSPADE expects.
ad_long_df <- ad_df %>%
  left_join(ad_count, by = "subject_id") %>%
  filter(n > 1) %>%
  select(-n) %>%
  arrange(subject_id, event_id)

# Build a transactions object whose transactionInfo carries the sequence
# (patient) and event (admission order) identifiers required by cSPADE.
item_list <- as.list(ad_long_df$items)
# Fix: use `<-` (not `=`) for top-level assignment and seq_along() instead of
# the c(1:length(x)) anti-pattern (which also misbehaves for empty input).
names(item_list) <- seq_along(item_list)  # transaction IDs 1..N

ad_trans <- as(item_list, "transactions")
transactionInfo(ad_trans)$sequenceID <- ad_long_df$subject_id
transactionInfo(ad_trans)$eventID <- ad_long_df$event_id

ad_trans %>% as("data.frame") %>% as_tibble() %>% head(20) %>%  knitr::kable()
items transactionID sequenceID eventID
{PATIENT FORAMEN OVALE PATENT FORAMEN OVALE MINIMALLY INVASIVE/SDA} 1 17 1
{PERICARDIAL EFFUSION} 2 17 2
{CONGESTIVE HEART FAILURE} 3 21 1
{SEPSIS} 4 21 2
{CORONARY ARTERY DISEASEARTERY BYPASS GRAFT/SDA} 5 23 1
{BRAIN MASS} 6 23 2
{CHEST PAIN} 7 34 1
{BRADYCARDIA} 8 34 2
{CORONARY ARTERY DISEASEARTERY BYPASS GRAFT /SDA} 9 36 1
{CHEST PAIN/SHORTNESS OF BREATH} 10 36 2
{VENTRAL HERNIA/SDA} 11 36 3
{NON-HODGKIN’S LYMPHOMIAMARROW TRANSPLANT} 12 61 1
{FEBRILE,NEUTROPENIA,NON-HODGKINS LYMPHOMA} 13 61 2
{INCISIONAL HERNIA} 14 67 1
{SUBARACHNOID HEMORRHAGE} 15 67 2
{PNEUMONIA} 16 68 1
{WEAKNESS} 17 68 2
{MEDIAL PARIETAL TUMOR/SDA} 18 84 1
{GLIOBLASTOMA,NAUSEA} 19 84 2
{AORTIC STENOSISCATH} 20 85 1
# Top 20 items by raw transaction counts (sequential transactions).
arules::itemFrequencyPlot(
  ad_trans,
  topN = 20,
  type = "absolute",
  main = "Absolute Item Frequency Plot",
  ylab = "Item Frequency (Absolute)"
)

# The same items, shown as a share of all transactions.
arules::itemFrequencyPlot(
  ad_trans,
  topN = 20,
  type = "relative",
  main = "Relative Item Frequency Plot",
  ylab = "Item Frequency (Relative)"
)

Pattern Mining

# Mine frequent sequences with cSPADE at a 0.1% minimum support.
itemsets <- cspade(ad_trans, 
                   parameter = list(support = 0.001), 
                   control = list(verbose = FALSE))

# as(itemsets, "data.frame") %>% as_tibble() %>% 
#   arrange(desc(support)) %>% 
#   slice_max(support, n = 20) %>% 
#   ggplot(aes(x = fct_reorder(sequence, support),
#                     y = support,
#                     fill = sequence)) + 
#     geom_col() + 
#     geom_label(aes(label = support %>% scales::percent()), hjust = 0.5) + 
#     labs(
#       x = "Diagnosis", 
#       y = "Support", 
#       title = "Diagnosis Frequency") + 
#     scale_fill_discrete(guide = F) +
#     scale_y_continuous(labels = scales::percent,
#                        expand = expansion(mult = c(0, .1))) + 
#     coord_flip() + 
#     cowplot::theme_cowplot() + 
#     theme(
#       axis.text=element_text(size=6), 
#       plot.caption = element_markdown(hjust = 0),
#       plot.caption.position = 'plot',
#       plot.title.position = 'plot'
#     )

summary(itemsets)
## set of 674 sequences with
## 
## most frequent items:
##                PNEUMONIA                   ASTHMA                TELEMETRY 
##                       86                       71                       67 
## CONGESTIVE HEART FAILURE        COPD EXACERBATION                  (Other) 
##                       60                       58                      730 
## 
## most frequent elements:
##                {PNEUMONIA} {CONGESTIVE HEART FAILURE} 
##                         80                         52 
##                   {SEPSIS}                   {ASTHMA} 
##                         50                         47 
##                {TELEMETRY}                    (Other) 
##                         41                        745 
## 
## element (sequence) size distribution:
## sizes
##   1   2   3   4   5 
## 310 299  62   2   1 
## 
## sequence length distribution:
## lengths
##   1   2   3   4   5 
## 270 306  74  20   4 
## 
## summary of quality measures:
##     support        
##  Min.   :0.001063  
##  1st Qu.:0.001196  
##  Median :0.001728  
##  Mean   :0.004203  
##  3rd Qu.:0.003588  
##  Max.   :0.113090  
## 
## includes transaction ID lists: FALSE 
## 
## mining info:
##      data ntransactions nsequences support
##  ad_trans         19958       7525   0.001
# Derive sequential rules from the frequent sequences at a 10% confidence floor.
rules <- ruleInduction(
  itemsets,
  confidence = 0.1,
  control = list(verbose = FALSE)
)

# inspect(head(rules))

# Highest-confidence sequential rules.
as(rules, "data.frame") %>%
  as_tibble() %>%
  arrange(desc(confidence)) %>%
  slice_head(n = 6) %>%
  knitr::kable()
rule support confidence lift
<{DIABETIC KETOACIDOSIS,TELEMETRY}> => <{DIABETIC KETOACIDOSIS}> 0.0010631 0.8000000 44.26471
<{COPD EXACERBATION},{ASTHMA,CHRONIC OBST PULM DISEASE}> => <{ASTHMA}> 0.0013289 0.6666667 33.44444
<{HYPERGLYCEMIA},{DIABETIC KETOACIDOSIS}> => <{DIABETIC KETOACIDOSIS}> 0.0010631 0.6666667 36.88725
<{ASTHMA,COPD EXACERBATION},{ASTHMA,CHRONIC OBST PULM DISEASE}> => <{ASTHMA}> 0.0010631 0.6666667 33.44444
<{COPD EXACERBATION},{CHRONIC OBST PULM DISEASE}> => <{ASTHMA}> 0.0017276 0.6190476 31.05556
<{COPD EXACERBATION},{ASTHMA,CHRONIC OBST PULM DISEASE}> => <{COPD EXACERBATION}> 0.0011960 0.6000000 33.69403
# Rebuild sequences where an "event" is a run of admissions: a new segment
# starts whenever the gap since the previous admission reaches the threshold.
ad_df <- admissions %>% 
  janitor::clean_names() %>%
  # filter(subject_id==109) %>% 
  filter(diagnosis != "NEWBORN") %>% 
  mutate(
    admittime = as.POSIXct(admittime, format = "%Y-%m-%d %H:%M:%S")
  ) %>% 
  arrange(subject_id, admittime) %>% 
  select(subject_id, admittime, diagnosis) %>% 
  separate_rows(diagnosis, sep = ";") %>% 
  filter(diagnosis != "") %>% 
  filter(diagnosis != " ") %>% 
  group_by(subject_id) %>% 
  arrange(admittime) %>% 
  mutate(
    # Fix: force the gap into seconds. Bare POSIXct subtraction returns a
    # difftime with auto-chosen units (secs/mins/hours/days/weeks), so
    # comparing it to a seconds-valued constant was unit-dependent.
    time_diff = as.numeric(difftime(admittime, lag(admittime), units = "secs")),
    # Threshold = 60*60*24*395*100 seconds (~108 years). MIMIC-III timestamps
    # are de-identified by large per-patient shifts, so this cutoff rarely (if
    # ever) splits a patient's admissions — NOTE(review): confirm it is intended.
    new_segment = if_else(is.na(time_diff) | time_diff >= 60 * 60 * 24 * 395 * 100, 1, 0),
    event_id = cumsum(new_segment)
  ) %>% 
  ungroup() %>% 
  group_by(subject_id, event_id) %>% 
  summarise(
    items = list(diagnosis)  # one itemset per (patient, segment)
  )

# Events per patient (ad_df is still grouped, so ungroup before counting);
# keep only patients with more than one event.
ad_count <- ad_df %>%
  ungroup() %>%
  count(subject_id, name = "n", sort = TRUE)

ad_long_df <- ad_df %>%
  left_join(ad_count, by = "subject_id") %>%
  filter(n > 1) %>%
  select(-n)

ad_long_df %>% head(10) %>% knitr::kable()

# Assemble the cSPADE transactions: itemsets keyed by patient (sequenceID)
# and segment (eventID).
item_list <- as.list(ad_long_df$items)
# Fix: consistent `<-` assignment and seq_along() instead of the
# c(1:length(x)) anti-pattern (which also misbehaves for empty input).
names(item_list) <- seq_along(item_list)  # transaction IDs 1..N

ad_trans <- as(item_list, "transactions")
transactionInfo(ad_trans)$sequenceID <- ad_long_df$subject_id
transactionInfo(ad_trans)$eventID <- ad_long_df$event_id

ad_trans_df <- ad_trans %>% as("data.frame") %>% as_tibble()

head(ad_trans_df, 10) %>% knitr::kable()

# Mine frequent sequences on the time-segmented transactions (0.1% support).
itemsets <- cspade(ad_trans, 
                   parameter = list(support = 0.001), 
                   control = list(verbose = FALSE))

# Bar chart of the 20 best-supported sequences, labelled as percentages.
as(itemsets, "data.frame") %>% as_tibble() %>% 
  arrange(desc(support)) %>% 
  slice_max(support, n = 20) %>% 
  ggplot(aes(x = fct_reorder(sequence, support),
                    y = support,
                    fill = sequence)) + 
    geom_col() + 
    geom_label(aes(label = support %>% scales::percent()), hjust = 0.5) + 
    labs(
      x = "Diagnosis", 
      y = "Support", 
      title = "Diagnosis Frequency") + 
    # Fix: spell out FALSE — `F` is an ordinary, reassignable binding, not a
    # keyword, so `guide = F` breaks silently if F is redefined.
    scale_fill_discrete(guide = FALSE) +
    scale_y_continuous(labels = scales::percent,
                       expand = expansion(mult = c(0, .1))) + 
    coord_flip() + 
    cowplot::theme_cowplot() + 
    theme(
      axis.text = element_text(size = 6), 
      plot.caption = element_markdown(hjust = 0),
      plot.caption.position = 'plot',
      plot.title.position = 'plot'
    )

summary(itemsets)

# Induce sequential rules (10% confidence floor) and show the 20 with the
# highest support.
rules <- ruleInduction(
  itemsets,
  confidence = 0.1,
  control = list(verbose = FALSE)
)

# inspect(head(rules))

as(rules, "data.frame") %>%
  as_tibble() %>%
  arrange(desc(support)) %>%
  slice_head(n = 20) %>%
  knitr::kable()